

// Author: 			Olivia Kim
// Purpose: 	This file cleans data on college characteristics from the IPEDS/NCES.

// file parameters
local IPEDS_year		2013
local data_raw_IPEDS	"IPEDS_IC_2013.csv"
local data_raw_manual	"IPEDS_IC_manual.xlsx"

*****************************************************************************************************
*****************************************************************************************************
*****************************************************************************************************

*** CLEAN IPEDS DATA ON COLLEGE CHARACTERISTICS ***
		
// load raw IPEDS data  
insheet using "$raw/College/`data_raw_IPEDS'", clear

// rename variables
ren 	unitid	ID_IPEDS
ren 	institutionname	college
ren 	year	year
ren 	hd2013citylocationofinstitution	city
ren 	ic2013calendarsystem	calendar
ren 	drvic2013_rvpercentadmittedtotal	admit_total_pct
ren 	drvic2013_rvpercentadmittedmen	admit_men_pct
ren 	drvic2013_rvpercentadmittedwomen	admit_women_pct
ren 	drvic2013_rvtuitionandfees201314	tuitionFees
ren 	drvic2013_rvtotalpriceforindistr	price_inDistrict
ren 	drvic2013_rvtotalpriceforinstate	price_inState
ren 	drvic2013_rvtotalpriceforoutofst	price_outState
ren 	hd2013stateabbreviation	state_abbrev
ren 	hd2013fipsstatecode	code_fips
ren 	hd2013sectorofinstitution	sector
ren 	hd2013levelofinstitution	years
ren 	hd2013controlofinstitution	ownership

label var 	ID_IPEDS	 "IPEDS ID" 
label var 	college	 "institution name" 
label var 	year	 "academic year" 
label var 	city	 "city where institution is located" 
label var 	calendar	 "academic calendar" 
label var 	admit_total_pct	 "total percent of students admitted" 
label var 	admit_men_pct	 "total percent of male students admitted" 
label var 	admit_women_pct	 "total percent of female students admitted" 
label var 	tuitionFees	 "2013-2014 academic year tuition and fees" 
label var 	price_inDistrict	 "total price for in-district students living on campus 2013-2014" 
label var 	price_inState	 "total price for in-state students living on campus" 
label var 	price_outState	 "total price for out-of-state students living on campus" 
label var 	state_abbrev	 "state abbreviation" 
label var 	code_fips	 "FIPS state code" 
label var 	sector	 "college sector (ownership + level)" 
label var 	years	 "college level of education provided" 
label var 	ownership	 "college ownership" 



// encode
replace calendar ="1" if calendar=="Semester"
replace calendar ="2" if calendar=="Trimester"
replace calendar ="2" if calendar=="Four-one-four plan"
replace calendar ="3" if calendar=="Quarter"
replace calendar ="4" if calendar=="Continuous"
replace calendar ="5" if calendar=="Differs by program"
replace calendar ="5" if calendar=="Other academic year"
replace calendar ="5" if calendar=="Not applicable"
destring calendar, replace

label define cal 	1	"semester"	, replace
label define cal 	2	"trimester"	, add
label define cal 	3	"quarter"	, add
label define cal 	4	"continuous"	, add
label define cal 	5	"other"	, add
label values calendar cal

replace years="0" if years=="unknown"
replace years="1" if years=="Less than 2 years (below associate)"
replace years="2" if years=="2"
replace years="2" if years=="At least 2 but less than 4 years"
replace years="4" if years=="4"
replace years="4" if years=="Four or more years"
destring years, replace
label define yrs 0 "unknown" 1 "less than two-year credential" 2 "two-year college" 4 "four-year college", replace
label values years yrs

replace ownership ="0" if ownership=="Private for-profit"
replace ownership= "1" if ownership=="Private not-for-profit"
replace ownership= "2" if ownership=="Public"
destring ownership, replace
label define pubpriv 0 "private for-profit" 1 "private non-profi" 2 "public", replace 
label values ownership pubpriv

tempfile ipeds
save `ipeds', replace

*************************************
*************************************

// bring in manual data that aren't reported in the crosswalk file
import excel using "${raw}/College/`data_raw_manual.csv'", clear first
drop if ID==.

// encode
replace calendar ="1" if calendar=="Semester"
replace calendar ="2" if calendar=="Trimester"
replace calendar ="2" if calendar=="Four-one-four plan"
replace calendar ="3" if calendar=="Quarter"
replace calendar ="4" if calendar=="Continuous"
replace calendar ="5" if calendar=="Differs by program"
replace calendar ="5" if calendar=="Other academic year"
replace calendar ="5" if calendar=="Not applicable"
destring calendar, replace

label values calendar cal

replace years="0" if years=="unknown"
replace years="1" if years=="Less than 2 years (below associate)"
replace years="2" if years=="2"
replace years="2" if years=="At least 2 but less than 4 years"
replace years="4" if years=="4"
replace years="4" if years=="Four or more years"
destring years, replace
label values years yrs

replace ownership ="0" if ownership=="Private for-profit"
replace ownership= "1" if ownership=="Private not-for-profit"
replace ownership= "2" if ownership=="Public"
destring ownership, replace
label values ownership pubpriv

keep ID_IPEDS college calendar years ownership
tempfile manual
save `manual', replace

use `ipeds', clear
append using `manual'
duplicates tag ID_IPEDS, g(tag)
egen rownonmiss = rownonmiss(*), strok
egen maxnonmiss = max(rownonmiss),by(ID_IPEDS)
drop if rownonmiss < maxnonmiss & tag > 0
drop tag rownonmiss maxnonmiss
isid ID_IPEDS

*************************************
*************************************

*** define college types ***

gen byte college_any         = !missing(college)
gen byte college_public      = (ownership == 2)
gen byte college_private     = (ownership != 2) & !missing(college)
gen byte college_4yr         = (years ==4)
gen byte college_2yr         = (years == 2) | (years == 1)
gen byte college_MA          = (state_abbrev == "Massachusetts"|state_abbrev == "MA")
gen byte college_MA_public   = (college_MA & college_public)
gen byte college_OOS_public	 = (!college_MA & college_public)
gen byte college_out         = (state_abbrev != "Massachusetts" & !missing(college))
gen byte college_umass_amh = ID_IPEDS==166629
gen byte college_umass_bos = ID_IPEDS==166638
gen byte college_umass_dart = ID_IPEDS==167987
gen byte college_umass_low = ID_IPEDS==166513

// label college types
label var college_any		"any post-secondary institution"
label var college_public	"a public college"
label var college_private	"a private college"
label var college_4yr   	"a four-year college"
label var college_2yr		"a two-year or less college"
label var college_MA 		"a college in MA"
label var college_MA_public	"a public college in MA"
label var college_OOS_public "a public college out of MA"
label var college_out		"an out-of-state college"
label var college_umass_amh "UMass Amherst"
label var college_umass_bos "UMass Boston"
label var college_umass_dart "UMass Dartmouth"
label var college_umass_low "UMass Lowell"
// save clean college charateristics
save "${data_clean}/data_colleges", replace 

*Edits Oct 1, 2018
	* Assigning University of Michigan Medical School and Duke University - Fuqua to the undergrad IPEDS
	* Removing FSC-ID from Medical College of Georgia (001579) so that Augusta University can be matched to IPEDS
	* Deleted 487393 Florida Career College-West Palm Beach in IPEDS 2013 bc it had no data. Inserted data in IPEDS manual

	*SRC added Feb 2020

*fill in ipeds with more years of data
* educationdata using "college ipeds institutional-characteristics", csv sub(year=2006:2017) clear //available from 1980, 1984 on
 *save "$raw/College/instchar.dta", replace
*educationdata using "college ipeds directory", csv sub(year=2006:2017) clear //available from 1980, 1984 on
 *save "$raw/College/directory.dta", replace
 use  unitid inst_control institution_level year inst_name state_abbr opeid using "$raw/College/directory.dta", clear
 g college=upper(inst_name)
drop inst_name
g ownership = .
replace ownership = 0 if inst_control ==3
replace ownership = 1 if inst_control==2
replace ownership = 2 if inst_control==1
drop inst_control

g years = .
replace years=institution_l if institution_l>=1
ren state_abbr state_abbrev

*keep most recent years
bys unitid: g first = _n==_N
keep if first==1
drop first year 

ren opeid ID_OPE

tempfile directory 
save "`directory'"

use unitid year calendar using "$raw/College/instchar.dta", clear
g calendar =5 if calendar_s!=. //other
replace calendar = 1 if calendar_s==1 //semester
replace calendar = 2 if calendar_s==3 //trimenster
replace calendar = 3 if calendar_s==2 //quarter
replace calendar = 4 if calendar_s==7 //continuous


bys unitid: g first = _n==_N
keep if first==1
drop first year 

merge 1:1 unitid using "`directory'", nogen



ren unitid ID_IPEDS

gen byte college_any         = !missing(college)
gen byte college_public      = (ownership == 2)
gen byte college_private     = (ownership != 2) & !missing(college)
gen byte college_4yr         = (years ==4)
gen byte college_2yr         = (years == 2) | (years == 1)
gen byte college_MA          = (state_abbrev == "Massachusetts"|state_abbrev == "MA")
gen byte college_MA_public   = (college_MA & college_public)
gen byte college_OOS_public	 = (!college_MA & college_public)
gen byte college_out         = (state_abbrev != "Massachusetts" & !missing(college))
gen byte college_umass_amh = ID_IPEDS==166629
gen byte college_umass_bos = ID_IPEDS==166638
gen byte college_umass_dart = ID_IPEDS==167987
gen byte college_umass_low = ID_IPEDS==166513
// label college types
label var college_any		"any post-secondary institution"
label var college_public	"a public college"
label var college_private	"a private college"
label var college_4yr   	"a four-year college"
label var college_2yr		"a two-year or less college"
label var college_MA 		"a college in MA"
label var college_MA_public	"a public college in MA"
label var college_OOS_public "a public college out of MA"
label var college_out		"an out-of-state college"
label var college_umass_amh "UMass Amherst"
label var college_umass_bos "UMass Boston"
label var college_umass_dart "UMass Dartmouth"
label var college_umass_low "UMass Lowell"
compress
drop state college calendar_s institution_l

save "$data_clean/extra_ipeds.dta", replace

use "${data_clean}/data_colleges.dta", clear
keep ID_IPEDS calendar years ownership college_*

*2013 will be master year for this match, and extra only updates if there are missing 2013 info
merge 1:1 ID  using "$data_clean/extra_ipeds.dta" , update nogen
drop college_4yr college_2yr

*some more edits (Looked up by Selena Cardona Dec 2020, checked by SRC)
egen calendar1 = anymatch(ID_IPEDS), values(138983 138983 139126 140085 140304 140322 140483 140483 140997 364885 366702 440624)
replace calendar = calendar1 if calendar1==1
replace calendar=2 if ID_IPEDS== 248776


ren * ipeds_*
ren ipeds_ID_IPEDS ID_IPEDS
ren ipeds_ID_OPE ID_OPE
save "$data_clean/IPEDS.dta", replace

***************************************************************************************************

*setup chetty -- data downloaded from www.equality-of-opportunity.org/data

*earnings -- all
use super_opeid name tier tier_name k* using "$raw\College\chetty\mrc_table2.dta" , clear
	
*earnings by gender
merge 1:1 super_opeid using "$raw\College\chetty\mrc_table4.dta", keepusing(k*_m k*_f ) nogen

*college characteristics
merge 1:1 super_opeid using "$raw\College\chetty\mrc_table10.dta", nogen

*opeid crosswalk
merge 1:m super_opeid using "$raw\College\chetty\mrc_table11.dta", nogen


foreach v of varlist k_mean k_mean_m k_mean_f  k_q5 k_q5_m k_q5_f{
	qui su `v' if super_opeid==-99
	g `v'_neverattend=r(mean)
	qui su `v' if super_opeid==-1
	g `v'_insuffdata=r(mean)
	}

su k_mean k_mean_m k_mean_f  k_q5 k_q5_m k_q5_f if super_opeid==-99, sep(0) //never attend
 
su k_mean k_mean_m k_mean_f  k_q5 k_q5_m k_q5_f if super_opeid==-9 , sep(0)  //late

su k_mean k_mean_m k_mean_f  k_q5 k_q5_m k_q5_f if super_opeid==-1, sep(0) // insufficient data

drop if opeid==.
save "$data_clean\chetty_college.dta", replace 

*NSC credential file
	import excel using "${raw}\NSC\CREDENTIAL_LEVEL_LOOKUP_TABLE.xlsx", sheet(Credential Lookup_TRAD_20211118) firstrow clear
	rename CREDENTIAL_TITLE_REPORTED_TO_NSC degree_title_name
	duplicates drop
	drop if degree_title_name =="UG CERTIFICATE PROGRAM" & CREDENTIAL_LEVEL_CODE=="AD"
	drop if degree_title_name =="POST MASTERS" & CREDENTIAL_LEVEL_CODE=="MD"
	save "${data_clean}\nsc_credential_table.dta", replace


*** save tempfiles that need to be merged ***

// keep FSC-IPEDS ID crosswalk to merge in institutional characteristics data 
import excel "${raw}/College/college_crosswalk.xls", firstrow clear
keep ID_FSC ID_FSC_long ID_IPEDS ID_OPE name
ren name college_name
replace ID_FSC_long=trim(ID_FSC_long)
replace ID_FSC_long = trim(ID_FSC) + "-00" if !missing(ID_FSC) & missing(ID_FSC_long)
drop ID_FSC 
keep if ID_IPEDS!=.
duplicates drop ID_FSC_long, force
drop if missing(ID_FSC_long)
rename ID_FSC_long ID_FSC
preserve
drop college_name
save "$data_clean/FSC.dta", replace
restore
keep ID_FSC college_name
g opeid= substr(ID_FSC,1,6)
destring opeid, replace force
drop if opeid==.
g opeid_AA = opeid
g opeid_BA = opeid
g opeid_firstinst =opeid
g college_name_AA = college_name
g college_name_BA = college_name
g college_name_firstinst =college_name
drop opeid college_name
duplicates drop
*remaining dups are different campuses of same school
duplicates drop opeid_BA, force
save "$data_clean/collegenames.dta", replace

